Use multiple feature extractors on the same data and concatenate their results.
In [ ]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_union, make_pipeline
from sklearn.svm import LinearSVC
# GridSearchCV moved to sklearn.model_selection in scikit-learn 0.18 and the
# old sklearn.grid_search module was removed in 0.20; keep it as a fallback.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV
In [ ]:
from sklearn.datasets import fetch_20newsgroups
In [ ]:
# Load the 20 newsgroups corpus. "train" is the subset scikit-learn returns
# when no subset is given, spelled out here for clarity.
news = fetch_20newsgroups(subset="train")
In [ ]:
# Raw documents and their integer class labels, taken from the loaded bunch.
data = news.data
y = news.target
In [ ]:
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Try the current
# location first and fall back so the notebook still runs on old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out part of the corpus for evaluation, using the library's default
# split proportions. No random_state is passed, so the split differs per run.
data_train, data_test, y_train, y_test = train_test_split(data, y)
In [ ]:
# Two count-based extractors applied to the same raw text: one tokenizes by
# character, the other by word. make_union combines both into one transformer.
char_and_word = make_union(
    CountVectorizer(analyzer="char"),
    CountVectorizer(analyzer="word"),
)

# Feed the combined features into a linear SVM.
text_pipe = make_pipeline(char_and_word, LinearSVC(dual=False))

# Search the SVM's C over six powers of ten, from 1e-3 up to 1e2.
param_grid = {'linearsvc__C': 10. ** np.arange(-3, 3)}
grid = GridSearchCV(
    estimator=text_pipe,
    param_grid=param_grid,
    cv=5,
    verbose=10,
)
In [ ]:
# Run the 5-fold grid search over C on the training split only.
grid.fit(X=data_train, y=y_train)
In [ ]:
# Wider search space: n-gram ranges for both vectorizers plus the SVM's C.
# Keys follow the "<step>__<substep>__<param>" convention; the "-1"/"-2"
# suffixes distinguish the two CountVectorizer members of the union
# (presumably in the order they were passed: char first, then word).
param_grid = {
    'featureunion__countvectorizer-1__ngram_range': [
        (1, 3),
        (1, 5),
        (2, 5),
    ],
    'featureunion__countvectorizer-2__ngram_range': [
        (1, 1),
        (1, 2),
        (2, 2),
    ],
    # Six powers of ten, from 1e-3 up to 1e2.
    'linearsvc__C': 10. ** np.arange(-3, 3),
}